function Est = gpu_RLA(Image,Kernel,Iter,Norm)
%GPU_RLA Richardson-Lucy deconvolution along the 3rd array dimension (GPU).
%
%   Est = gpu_RLA(Image,Kernel,Iter,Norm)
%   Est = gpu_RLA(Image,Kernel,Iter)        % Norm defaults to 1
%
% Inputs
%   Image  - data array, deconvolved along dimension 3. Negative entries
%            are set to 0 before the iteration starts.
%   Kernel - only Kernel(1,1,:) is used; that single profile is applied to
%            every (row,column) of Image. size(Kernel,3) must equal
%            size(Image,3). It is normalized to unit sum inside this function.
%   Iter   - number of Richardson-Lucy iterations.
%   Norm   - the final estimate is divided by Norm (optional, default 1).
%
% Output
%   Est    - estimate ./ Norm, gathered from the GPU; NaN entries are
%            replaced by 0.
%
% Requires the external parfor_wait class for the progress bar.
%
%% Usage notes
% fft(X,[],3) means fft along the 3rd dimension of the array (ifft
% respectively). The fft dimension must be adjusted to your data layout
% (x -> 2 / y -> 1 / z -> 3)!
%
% The kernel has to be normalized to 1 in the direction of the fft
% (e.g. Kernel = Kernel./sum(Kernel,3);) - this is done below.
%
% Image and kernel have to be non-negative for RLA to work properly
% (e.g. Kernel(Kernel<0) = 0;). Only Image is clamped here; the kernel is
% used as given.
%
% Because the discrete fft is periodic, the start and the end point of the
% array have to be roughly matched to avoid discontinuities that lead to
% edge artefacts (matching with a cosine worked fine for me).
%
% For the fft to work fastest, every dimension of the array should be a
% power of two -> pad with a cosine. After deconvolution just restore the
% original length (e.g. Image = Image(:,:,1:OrgLength);).
%
% After matching start and end, the main feature of the kernel (e.g. the
% peak of a Gaussian) has to be placed around the first array entry
% (-> phase = 0), otherwise the deconvolved spectrum is shifted and
% corrupted (e.g. Image = circshift(Image, round(size(Image,3)/2), 3)).
%
% Due to the discrete form of fft/ifft the result won't always be a real
% number -> sign(real(.)) restores the sign, abs(.) gives the magnitude of
% the complex number.
% For higher dimensionality use fftn(Image) and normalize the kernel
% globally (Kernel = Kernel./sum(Kernel,'all');).
%
% -------------------------------------------------------------------------
% Iterative formula:
%   Est_{n+1}(E) = Est_n(E) .* ( ( Image(E) ./ (Kernel(E) ** Est_n(E)) ) ** Kernel'(E) )
%
% (**) denotes convolution, Kernel'(E) is the flipped kernel -> obtained by
% flipping along the fft direction or via the complex conjugate of the fft.
% -------------------------------------------------------------------------

%% Input handling
% Default the normalization so a missing argument does not surface only
% after all iterations have already been computed.
if nargin < 4 || isempty(Norm)
    Norm = 1;
end

% Only the (1,1,:) profile of the kernel is used for every pixel.
Kernel = Kernel(1,1,:);
if size(Kernel,3) ~= size(Image,3)
    error('gpu_RLA:sizeMismatch', ...
          'Kernel has %d entries along dimension 3 but Image has %d.', ...
          size(Kernel,3), size(Image,3));
end

%% Start
Image          = gpuArray(Image);
Image(Image<0) = 0;
Est            = Image;

% Keep the kernel as a 1x1xK array; implicit expansion broadcasts it over
% rows and columns, so no full-size copies are held on the GPU.
Kernel         = gpuArray(Kernel);
Kernel         = Kernel./sum(Kernel,3);

%% FFT
FFTKernel      = fft(Kernel,[],3);
FFTFlipKernel  = conj(FFTKernel);   % conjugate spectrum == flipped kernel

%% Iterative Deconvolution
WaitRLA   = parfor_wait(Iter, 'Waitbar', true,'ReportInterval',1);
% Make sure the progress bar is torn down even if an iteration errors
% (e.g. the GPU runs out of memory).
WaitGuard = onCleanup(@() WaitRLA.Destroy());
for iter = 1:Iter
    % Current estimate convolved with the kernel.
    B = ifft(FFTKernel .* fft(Est,[],3),[],3);
    B = sign(real(B)).*abs(B);

    % Ratio Image ./ B; entries where B == 0 keep the Image value.
    A       = Image;
    NonZero = B ~= 0;
    A(NonZero) = A(NonZero) ./ B(NonZero);
    clearvars B NonZero             % free GPU memory before the next fft

    % Ratio convolved with the flipped kernel.
    A = ifft(fft(A,[],3) .* FFTFlipKernel,[],3);
    A = sign(real(A)).*abs(A);

    Est = Est.*A;
    clearvars A

    WaitRLA.Send;
end
clear WaitGuard                     % runs WaitRLA.Destroy now

Est(isnan(Est)) = 0;
Est             = gather(Est./Norm);
end